• DOMAIN: Automobile
• CONTEXT: The purpose is to classify a given silhouette as one of three types of vehicle, using a set of features extracted from the silhouette.
The vehicle may be viewed from one of many different angles.
• DATA DESCRIPTION: The data contains features extracted from the silhouettes of vehicles viewed at different angles. Four "Corgi" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000 and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, the van and either one of the cars would be readily distinguishable, but that it would be more difficult to distinguish between the two cars.
• PROJECT OBJECTIVE: Apply dimensionality reduction technique – PCA and train a model using principal components instead of training the model using just the raw data.
2.EDA and visualisation: Create a detailed performance report using univariate, bi-variate and multivariate EDA techniques. Find out all possible hidden patterns by using all possible methods.
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Load the raw vehicle-silhouette dataset.
# NOTE(review): hard-coded local Windows path — only runs on this machine.
df=pd.read_csv("C:/Users/HP/Downloads/vehicle-1 (1).csv")
# Notebook-style display of the full frame.
df
# Inspect the target labels (the classification classes).
df["class"].unique()
# Summary statistics for every numeric feature.
df.describe().transpose()
# Missing-value count per column, *before* '?' markers are converted,
# so this undercounts the true missingness.
df.isnull().sum()
# The raw file uses '?' as its missing-value marker; convert to NaN so
# pandas treats those cells as missing.
df = df.replace('?', np.nan)
df.shape
# Show every row that contains at least one missing value.
df[df.isnull().any(axis=1)]
# Column medians — these are the imputation values used below.
# NOTE(review): on pandas >= 2.0 this raises for the non-numeric 'class'
# column unless numeric_only=True is passed — confirm the pandas version.
df.median()
# Feature-only frame (target dropped), with per-column median imputation.
df_var=df.drop('class',axis=1)
df_var=df_var.apply(lambda x: x.fillna(x.median()),axis=0)
Since there are two car sub-groups hidden inside the single CAR class, we are going to use clustering to separate them.
# Split the frame into non-car rows (kept aside for later recombination) ...
filt=(df['class']!='car')
df_others=df.loc[filt]
# ... and car rows, which will be clustered into the two car models.
filt=(df['class']=='car')
df_car=df.loc[filt]
# Features only, median-imputed per column (K-Means cannot handle NaN).
df_car_var=df_car.drop('class',axis=1)
df_car_var=df_car_var.apply(lambda x: x.fillna(x.median()),axis=0)
# Standardize each feature to zero mean / unit variance so that no single
# feature dominates the Euclidean distances used by K-Means.
from scipy.stats import zscore
df_car_var_z = df_car_var.apply(zscore)
# X is the clustering input used by the elbow analysis below.
X=df_car_var_z
X
from sklearn.cluster import KMeans
from scipy.spatial import distance

# Elbow analysis: fit K-Means for k = 1..9 and record, for each k,
# the inertia (within-cluster sum of squared distances) and the mean
# distortion (average distance from each point to its nearest centroid).
distortion = []
cluster_range = range(1, 10)
cluster_errors = []
for num_clusters in cluster_range:
    # random_state makes the elbow curve reproducible across runs
    # (the original fit was unseeded, so the curve could change per run).
    clusters = KMeans(n_clusters=num_clusters, n_init=5, random_state=2354)
    clusters.fit(X)
    cluster_errors.append(clusters.inertia_)
    distortion.append(
        sum(np.min(distance.cdist(X, clusters.cluster_centers_, 'euclidean'), axis=1))
        / X.shape[0]
    )
clusters_df = pd.DataFrame({"num_clusters": cluster_range, "cluster_errors": cluster_errors})
clusters_df[0:15]
# Plot the elbow curve: the "knee" suggests the natural number of clusters.
plt.figure(figsize=(12, 6))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker="o")
plt.xlabel("Number of clusters (k)")
plt.ylabel("Within-cluster sum of squares (inertia)")
From the above chart we can see that there are two dominant clusters present — hence the two types of cars.
# Final clustering: k = 2 (chosen from the elbow chart above), seeded for
# reproducibility, fit on the standardized car features.
cluster = KMeans( n_clusters = 2, random_state = 2354 )
cluster.fit(X)
prediction= cluster.predict(X)
# assign() returns a new frame, avoiding the chained-assignment
# (SettingWithCopy) warning that writing into the .loc slice df_car
# would otherwise raise — masked here only because warnings are suppressed.
df_car = df_car.assign(group=prediction)
df_car
# Recombine car and non-car rows into one frame.
df=pd.concat([df_others,df_car],axis=0,ignore_index=True)
df
# Non-car rows had no 'group' column, so concat filled NaN; map them to 0
# (the group value is only meaningful for 'car' rows).
df['group'] = df['group'].fillna(0)
df
def Corgie_name(col):
    """Map a (class, group) row to the concrete Corgi model name.

    Parameters
    ----------
    col : pandas.Series
        A row from df[['class', 'group']] (apply with axis=1), holding the
        original 'class' label and the K-Means cluster 'group'. The group
        is only meaningful for cars: 1 -> Saab, anything else -> Opel.

    Returns
    -------
    str or None
        The model name, or None for an unrecognised class label.
    """
    # Label-based access: the original used positional indexing
    # (col[0] / col[1]), which is deprecated on a labelled Series in
    # modern pandas and breaks in pandas 3.x.
    Class = col['class']
    group = col['group']
    if Class == 'van':
        return 'Cheverolet_van'
    elif Class == 'bus':
        return 'Double_decker_bus'
    elif Class == 'car':
        return 'Saab_9000' if group == 1 else 'Opel_Manta_400'
    # Unknown class label: make the implicit None explicit.
    return None
# Replace the coarse 'class' label with the concrete model name derived
# from the original class plus the K-Means cluster group.
df['class']=df[['class','group']].apply(Corgie_name,axis=1)
df
# Should now show the four concrete model names.
df['class'].unique()
There are a few missing values present in each feature, so let's impute them.
# Missing-value count per column after recombination.
df.isnull().sum()
# Column medians — the imputation values used below.
# NOTE(review): may raise on pandas >= 2.0 for the non-numeric 'class'
# column unless numeric_only=True is passed — confirm pandas version.
df.median()
# Drop the non-feature columns, impute each feature with its median,
# then re-attach the target for the hue in the pairplot.
df_var=df.drop(['class','group'],axis=1)
df_var=df_var.apply(lambda x: x.fillna(x.median()),axis=0)
df_var['class']=df['class']
df_var
# Multivariate EDA: pairwise scatter plots with per-class KDE diagonals.
sns.pairplot(df_var, diag_kind='kde', hue = 'class')
Using a Plain SVM model on the processed data, without the help of PCA
# Baseline: linear SVM on the standardized raw features (no PCA).
target = df_var["class"]
features = df_var.drop(["class"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=43)
# Fit the scaler on the training fold only, then apply it to the test fold
# (prevents information leaking from test to train).
X_train_z = sc.fit_transform(X_train)
X_test_z = sc.transform(X_test)
# gamma has no effect with the linear kernel; kept for interface parity.
svc_model = SVC(C=.1, kernel='linear', gamma=1)
svc_model.fit(X_train_z, y_train)
prediction = svc_model.predict(X_test_z)
print(svc_model.score(X_train_z, y_train))
print(svc_model.score(X_test_z, y_test))
from sklearn.metrics import accuracy_score, confusion_matrix
# Fix: confusion_matrix expects (y_true, y_pred); the original call had
# the arguments swapped, which transposes the matrix.
print("Confusion Matrix:\n", confusion_matrix(y_test, prediction))
Using several kernel tricks to find out which gives the highest performance
# Try alternative kernels on the same standardized split; refit and print
# the train / test accuracy for each in turn. After the loop, svc_model
# and prediction hold the last (RBF) model's results, exactly as before.
for kernel_name in ('poly', 'sigmoid', 'rbf'):
    svc_model = SVC(kernel=kernel_name)
    svc_model.fit(X_train_z, y_train)
    prediction = svc_model.predict(X_test_z)
    print(svc_model.score(X_train_z, y_train))
    print(svc_model.score(X_test_z, y_test))
We can infer from the metrics that the RBF method gives the best results
from sklearn.metrics import accuracy_score, confusion_matrix
# Fix: confusion_matrix expects (y_true, y_pred); the original call had
# the arguments swapped, which transposes the matrix.
print("Confusion Matrix:\n", confusion_matrix(y_test, prediction))
from scipy.stats import zscore
# Standardize every feature (z-score) before PCA, so components are not
# dominated by large-scale features; drop the target first.
df_var_f = df_var.drop('class',axis=1)
df_var_z = df_var_f.apply(zscore)
# Raw ndarray view — the PCA input used in the next section.
array = df_var_z.values
array
Using a SVM model on the processed data, with the help of PCA
from sklearn.decomposition import PCA
# Project the standardized features onto the first 10 principal components.
pca = PCA(10)
projected = pca.fit_transform(array)
print(projected.shape)
X_train, X_test, y_train, y_test = train_test_split(projected, target, test_size=0.3, random_state=43)
# Scale on the training fold only, then transform the test fold.
X_train_z = sc.fit_transform(X_train)
X_test_z = sc.transform(X_test)
# gamma has no effect with the linear kernel; kept for interface parity.
svc_model = SVC(C=.1, kernel='linear', gamma=1)
# Fix: the original fit and scored on the *unscaled* split even though the
# scaled arrays were computed just above — use the scaled data throughout.
svc_model.fit(X_train_z, y_train)
prediction = svc_model.predict(X_test_z)
print(svc_model.score(X_train_z, y_train))
print(svc_model.score(X_test_z, y_test))
# RBF SVM on the raw PCA projection (same split seed as the linear run).
X_train, X_test, y_train, y_test = train_test_split(projected, target, test_size=0.3, random_state=43)
svc_model = SVC(kernel='rbf')
svc_model.fit(X_train, y_train)
prediction = svc_model.predict(X_test)
print(svc_model.score(X_train, y_train))
print(svc_model.score(X_test, y_test))
from sklearn.metrics import accuracy_score, confusion_matrix
# Fix: confusion_matrix expects (y_true, y_pred); the original call had
# the arguments swapped, which transposes the matrix.
print("Confusion Matrix:\n", confusion_matrix(y_test, prediction))
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting on the same PCA train/test split, for comparison.
gbcl = GradientBoostingClassifier(n_estimators=150, learning_rate=0.05)
gbcl.fit(X_train, y_train)
print("Training Score")
print(gbcl.score(X_train, y_train))
print("Testing Score")
print(gbcl.score(X_test, y_test))
from sklearn.metrics import accuracy_score, confusion_matrix
# Fix: the original printed the confusion matrix of the *previous SVC's*
# predictions; compute this model's own test-set predictions instead,
# and pass (y_true, y_pred) in the documented order.
prediction = gbcl.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, prediction))